# pip install accelerate
from modelscope import AutoTokenizer, AutoModelForCausalLM
import torch

# Single source of truth for the checkpoint id used by both loaders below.
MODEL_ID = "LLM-Research/gemma-2-2b-it"

# Both loaders pass local_files_only=True, so the checkpoint is expected to be
# present in the local cache already.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",  # let accelerate decide where the weights are placed
    torch_dtype=torch.bfloat16,
    local_files_only=True,
)

input_text = "请说一下勾股定理"
# Move the tokenized prompt to whatever device the model was dispatched to.
# With device_map="auto" that is not guaranteed to be "cuda" (e.g. CPU-only
# hosts), so a hard-coded device name can cause a device-mismatch failure.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate up to 1024 new tokens and print the decoded first sequence.
outputs = model.generate(**input_ids, max_new_tokens=1024)
print(tokenizer.decode(outputs[0]))
